#!/usr/bin/python
# -*- coding: cp1251 -*-

# TODO: Create Index
r"""
NAME:
    pyDSL2Sqlite v. 0.12; last update: 07-Jan-2008

AUTHOR:
    zhuman 2012

DESCRIPTION:
    Parse Lingvo DSL source file (in Unicode UCS2-LE format or ANSI)
    and:
    1. Detect entry and field boundaries
    2. Convert DSL markup to HTML, according to conversion tables (DSL2HTM_DICT_xxxx)
    3. Save entries to SQLite database (in UTF-8 encoding).


TODO !!!:
    Update conversion tables
    Check for locked database: should not be run while the CGI application is running
    ADD <br> for each line
    Add 'create index' after import
FIXME:
    [[SUP]] not processed
"""

import sys
import os
import re
import codecs
import time
import sqlite3

# ----------------------------------------------------------------
# Configuration settings for pyDSL2HTM
# ----------------------------------------------------------------

# Database file used by sqliteDB() when the caller does not name one
SQLITE_DB_FILE = "sqlite3.db"

# Default number of rows returned by sqliteDB.get_records()
MAX_HITS = 15

# Entry and field markers (only referenced from commented-out code
# in writeEntry())
ENTRY_SEPARATOR = os.linesep + '<!--~-->'
FIELD_SEPARATOR = '<!--=-->'

# Separator for line breaks within the definition
NEW_LINE_MARKER = '<dd>'

# Line terminator; convert_dsl() replaces it with the one found in
# the input file
END_OF_LINE = '\n'

# Strings to replace in HEADWORD only (applied through str_replace())
HEADWORD_REPL = {
    u'{\u2027}': '',   # for Collins 2006
    u'{\u00B7}': '',   # for Webster New World Unabridged
    r"{[']}": '',
    r"{[/']}": '',
    '\\': '',
}

# First pass (plain string replacement): turn the DSL link brackets into
# [ref] tags before the angle brackets get HTML-escaped in the second pass.
DSL2HTM_DICT_1ST_PASS = {
    # r'{{link}}<<': '<<',    # for Spanish Larousse
    # r'>>{{/link}}': '>>',   # ibid.
    '<<': '[ref]',
    '>>': '[/ref]',
}

# Second pass (plain string replacement): escape the remaining angle
# brackets and drop the [!trs] markers.
DSL2HTM_DICT_2ND_PASS = {
    '>': '&gt;',
    '<': '&lt;',
    r'[!trs]': '',
    r'[/!trs]': '',
}

## tk callback logger hook
# callback: optional callable; convert_dsl() stores its callbackLog argument
#           here and logCallback() forwards every message to it.
# _in_file, _db: handles that cleanExit() closes when they are set.
callback = None
_in_file = None
_db = None
# Tags to replace on third pass: m,c,trn,t,com,p,ex,ref, <<, >>
#
# Maps a compiled regular expression to its replacement string; applied by
# str_replace_re() from doDsl2HtmlConv() after the two plain-string passes.
#
# NOTE(review): str_replace_re() walks this dict in dict iteration order,
# and several patterns overlap ([trn] and [/trn] are matched both by the
# "SPAN" patterns and by the "remove TAGS" pattern; the backslash-removal
# pattern interacts with escaped brackets).  The outcome for those inputs
# therefore depends on iteration order - confirm the intended order before
# editing or reordering entries.
#
# NOTE(review): the '<<' and '>>' alternatives below presumably never match,
# because the first pass rewrites them to [ref]/[/ref] and the second pass
# escapes every remaining '<' and '>' - verify before relying on them.
DSL2HTM_DICT_3RD_PASS = {
    # opening tags to SPANs
    re.compile('\[(m|c|trn|t|com|p|ex|ref)\]'):'<span class="\\1">',
    re.compile('(\[\*\]|\{\{)'):'<span class="star">',       # '{{' and '[*]'
    re.compile('\[\'\]'):'<span class="stress">',
    re.compile('(\[ref\s[^\]]+\]|<<)'):'<span class="ref">',

    # closing SPANs
    re.compile('>>|\}\}'):'</span>', # '}}' and '>>'
    # ('m' is listed twice in the alternation below; the second one is redundant)
    re.compile('\[/(?:c|m|trn|t|com|p|ex|ref|\'|\*|m)\]'):'</span>',

    #COLOR
    # e.g. [c red] to <span class="col_red">
    re.compile('\[c ([\w]+)\]'):'<span class="col_\\1">',

    #Images (add path var to substitute with real value later)
    # ex: [s]image.jpg[/s] to <img src="/dicpath/image.jpg">
    # The literal text ${imgdir} is emitted as is; nothing in this file
    # substitutes it.
    re.compile('\[s\]([\S]+(?:jpg|png))\[/s\]'):'<img src="${imgdir}\\1">',
    re.compile('\[s\]([\S]+wav)\[/s\]'):'<a class="sound" href ="${imgdir}\\1">play</a>',

    # TO SELF: sub, sup, u, i, b (e.g. [sub] to <sub>)
    re.compile('\[(sup|sub|u|i|b)\]'):'<\\1>',
    re.compile('\[/(sup|sub|u|i|b)\]'):'</\\1>',

    # Indentation: m1, m2, m3, m4,... (e.g. [m1] to <span class="m1">)
    # Only a single digit is accepted after 'm'.
    re.compile('\[m(\d)\]'):'<span class="m\\1">',

    # remove TAGS (replace with nothing)
    re.compile('\[/?(?:trn|lang)\]'):'',
    re.compile('\[lang\s[^\]]*\]'):'',

    # Remove escapes
    # (deletes every backslash in the text)
    re.compile(r'\\'):'',

    # Remove dictionary specific strings
    re.compile('Trimis de.*Sursa:\s*' ): ''  # rodex
    #'\t':'',
}

# ################################################################
# ################################################################
# #####################END OF CONFIG SECTION######################
# ################################################################
# ################################################################


# Encodings in effect; an empty value means "not chosen yet".
# parseCommandLine() fills them from -i / -o, convert_dsl() fills in
# whatever is still missing at run time.
INPUT_ENCODING = ''
OUTPUT_ENCODING = ''

# Read by convert_dsl() to decide whether to print the data-loss warning
DIRTY_CONVERSION = False

# SQLite target; parseCommandLine() sets these from -d and -t
DB_NAME = None
TABLE_NAME = None

def Usage():
    """Print the command-line help to stdout and exit with status -1.

    The help text names the long options exactly as parseCommandLine()
    spells them (the database option is ``--db-name``).  The script name
    shown is the base name of ``sys.argv[0]``.
    """
    script_name = os.path.basename(sys.argv[0])
    # Single parenthesised argument: behaves the same as a print statement
    # on Python 2 and as a function call on Python 3.
    print('''USAGE:
    %s  [-i Encoding] [-o Encoding] [-t table] [-d dbfile] <webster.dsl>
    Where:
    -i, --input-encoding  = input file encoding
    -o, --output-encoding = output file encoding
    -t, --table-name      = table name (filename used if none)
    -d, --db-name         = the sqlite3 db file (default is sqlite3.db)

    Possible encodings:
    utf8        8-bit variable length encoding
    utf_16      16-bit variable length encoding (little/big endian)
    utf_16_le   utf-16 but explicitly little endian
    utf_16_be   utf-16 but explicitly big endian
    cp1251      Windows Cyrillic
    cp866       OEM Cyrillic (DOS)
    ascii       7-bit ASCII codepage
    iso-8859-1  ISO 8859-1 (Latin 1) codepage
    For a complete list of encodings see:
    http://docs.python.org/lib/standard-encodings.html

EXAMPLES:

    "pyDsl2Sqlite.py" -d sqlite3.db -t oxford oxford.dsl
    imports DSL file to SQLite database (sqlite3.db)
''' % script_name)
    sys.exit(-1)

class sqliteDB(object):
    """Thin wrapper around one SQLite connection and one dictionary table.

    The table has two columns: ``headword`` (primary key) and
    ``definition``.  Errors raised while executing SQL are reported through
    logCallback() instead of being propagated, so a failing statement does
    not abort the import.

    Attributes:
        con: the sqlite3 connection.
        cur: a cursor on that connection, shared by all methods.
        table_name: name of the table the record methods work on.
    """

    def __init__(self, db_file= SQLITE_DB_FILE):
        """Open (or create) *db_file* and prepare a cursor."""
        logCallback( 'SQLite db file: ' + db_file)
        self.con = sqlite3.connect(db_file)
        self.cur = self.con.cursor()
        self.table_name = None

    def _quoted_table(self):
        """Return the current table name as a double-quoted SQL identifier.

        Table names cannot be bound as '?' parameters, so the name has to be
        pasted into the statement.  Quoting it (and doubling embedded quotes)
        keeps names such as ``En-Ru`` or ``2nd edition`` - which easily come
        from a file name - from breaking the SQL.
        """
        name = '%s' % (self.table_name,)
        return '"' + name.replace('"', '""') + '"'

    def create_table(self, table = None):
        """Drop and re-create the dictionary table.

        Args:
            table: new table name; when empty the current ``table_name``
                is kept.
        """
        if table:
            self.table_name = table
        try:
            quoted = self._quoted_table()
            self.cur.execute('DROP TABLE IF EXISTS %s' % quoted)
            self.con.commit()
            self.cur.execute(
                'CREATE TABLE %s (headword VARCHAR(256) PRIMARY KEY, definition TEXT)'
                % quoted)
            self.con.commit()
        except Exception as e:
            logCallback(str(e))

    def set_table_name(self, tablename):
        """Select the table used by the record methods."""
        self.table_name = tablename

    def add_record (self, headWord, definition):
        """Insert one entry; the caller commits (see commit()).

        Failures (for example a duplicate headword, which violates the
        primary key) are logged and otherwise ignored.
        """
        try:
            self.cur.execute(
                'INSERT INTO %s (headword, definition) VALUES (?, ?)'
                % self._quoted_table(),
                (headWord, definition))
        except Exception as e:
            logCallback(e)

    def get_records (self, searchWord, hits=MAX_HITS):
        """Return up to *hits* rows whose headword starts with *searchWord*.

        The search word is bound as a parameter; a '?' inside a quoted SQL
        literal is plain text, not a placeholder, so the pattern is built in
        Python and passed separately.  '%' and '_' inside *searchWord* keep
        their LIKE wildcard meaning.

        Returns:
            A list of (headword, definition) rows; empty when the query
            fails (the failure is logged).
        """
        try:
            self.cur.execute(
                'SELECT * FROM %s WHERE headword LIKE ?' % self._quoted_table(),
                (searchWord + '%',))
        except Exception as e:
            logCallback(e)
            return []
        return self.cur.fetchmany(hits)

    def commit (self):
        """Commit the pending transaction."""
        self.con.commit()

    def close(self):
        """Close the connection if there is one."""
        if self.con:
            self.con.close()


def parseCommandLine():
    """Parse ``sys.argv`` and store the option values in module globals.

    Recognised options:
        -i, --input-encoding   -> INPUT_ENCODING
        -o, --output-encoding  -> OUTPUT_ENCODING
        -t, --table-name       -> TABLE_NAME
        -d, --db-name          -> DB_NAME (``--database-file`` is accepted
                                  as an alias)
        -h, --help             -> print help and exit

    Returns:
        The list of positional arguments left after the options.

    An unknown option prints the help text and terminates the process.
    """
    import getopt
    global INPUT_ENCODING, OUTPUT_ENCODING, TABLE_NAME, DB_NAME
    try:
        # A trailing '=' marks a long option that takes a value; without it
        # getopt treats the option as a flag and the value as a positional.
        opts, args = getopt.getopt(
            sys.argv[1:], "i:o:t:d:h",
            ["input-encoding=",
             "output-encoding=",
             "table-name=",
             "db-name=",
             "database-file=",
             "help",
            ])
    except getopt.GetoptError:
        # print help information and exit
        Usage()
        sys.exit(2)  # only reached if Usage() ever stops exiting by itself

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            Usage()
        elif opt in ("-i", "--input-encoding"):
            INPUT_ENCODING = arg
        elif opt in ("-o", "--output-encoding"):
            OUTPUT_ENCODING = arg
        elif opt in ("-t", "--table-name"):
            TABLE_NAME = arg
        elif opt in ("-d", "--db-name", "--database-file"):
            DB_NAME = arg
    return args


def isFileUnicode(file_name, BOMbyte='\xFF'):
    '''Tell whether the first byte of the file equals BOMbyte.

    The file is opened in binary mode and exactly one byte is read; an
    empty file therefore never matches.  Returns True or False.
    '''
    with open(file_name, 'rb') as stream:
        first_byte = stream.read(1)
    return first_byte == BOMbyte

def detectLineTerminator(file_name):
    """Return the line terminator used by the first line of *file_name*.

    The file is read in binary mode up to the first newline byte.

    Returns:
        '\\r\\n', '\\n', '\\r' or '' (no terminator found in the first line).

    If the file cannot be opened the error is logged and the process exits
    with status -1.
    """
    try:
        f = open(file_name, 'rb')
    except IOError:
        logCallback( "Error: no such file: '%s'" % file_name)
        sys.exit(-1)
    try:
        first_line = f.readline()
    finally:
        # release the handle even when the read fails
        f.close()

    # Byte literals: the data was read in binary mode, so this comparison
    # is valid on Python 2 (where b'..' is a plain str) and on Python 3.
    terminator = ""
    if b'\r' in first_line:
        terminator += '\r'
    if b'\n' in first_line:
        terminator += '\n'
    return terminator


def str_replace( text, dic ):
    """Apply every plain-string replacement in *dic* to *text*.

    Args:
        text: the string to process.
        dic: mapping of search string -> replacement string.

    Returns:
        The text after all replacements.

    Keys are applied in sorted order, one after another, so a later
    replacement also sees the output of the earlier ones.
    """
    # sorted() works on any mapping; dict.keys().sort() only works where
    # keys() returns a list.
    for key in sorted(dic):
        text = text.replace(key, dic[key])
    return text

def str_replace_re( text, dic ):
    """Apply every regular-expression replacement in *dic* to *text*.

    Args:
        text: the string to process.
        dic: mapping of pattern (compiled or pattern string) -> replacement
            template as accepted by re.sub().

    Returns:
        The text after all substitutions.

    Substitutions run in the mapping's iteration order; each one sees the
    output of the previous ones, so overlapping patterns are order
    sensitive.
    """
    # items() exists on every mapping type; iteritems() is Python 2 only.
    for pat, repl in dic.items():
        text = re.sub(pat, repl, text)
    return text

##def multiple_replace(text, dic):
##
##  """ Replace in 'text' all occurences of any key in the given
##  dictionary by its corresponding value.  Returns the new string."""
##
##  # Create a regular expression  from the dictionary keys
##  regex = re.compile("(%s)" % "|".join(map(re.escape, dic.keys())))
##  #regex = re.compile("(%s)" % "|".join( dic.keys()))
##  # For each match, look-up corresponding value in dictionary
##  return regex.sub(lambda mo: dic[mo.string[mo.start():mo.end()]], text)
##
### data = codecs.open (, 'r', 'utf_16_le').read()

def doDsl2HtmlConv(data):
    """Convert DSL markup in *data* to HTML and return the result.

    Passes, in order:
        1. plain replacements from DSL2HTM_DICT_1ST_PASS
        2. plain replacements from DSL2HTM_DICT_2ND_PASS
        3. regular-expression replacements from DSL2HTM_DICT_3RD_PASS
        4. '<br>' inserted in front of every os.linesep

    The former debug print that looked for '{{link}}<<' after the first
    pass was removed: that pass rewrites every '<<', so the text could
    never match.
    """
    data = str_replace(data, DSL2HTM_DICT_1ST_PASS)
    data = str_replace(data, DSL2HTM_DICT_2ND_PASS)
    data = str_replace_re(data, DSL2HTM_DICT_3RD_PASS)

    # add HTML line breaks, keeping the physical line breaks
    return data.replace(os.linesep, '<br>' + os.linesep)

def logCallback(data):
    """Send *data* to the registered callback (if any) and to stdout.

    When the module-level ``callback`` is set:
      * if it carries an attribute ``TERMINATE`` equal to True, the abort is
        reported through the callback and cleanExit() ends the process;
      * otherwise the callback is called with *data*.

    A callback without a ``TERMINATE`` attribute is treated as "no abort
    requested".
    """
    if callback:
        # getattr() works for any callable and tolerates a missing
        # attribute; '== True' is kept so the accepted values are the same
        # as before.
        if getattr(callback, 'TERMINATE', False) == True:
            print('Got Terminate REQUEST!')
            callback('Conversion aborted', True, True)
            cleanExit()
        callback(data)
    if sys.stdout:
        print(data)

def cleanExit():
    """Release the module-level resources and terminate via sys.exit().

    Closes ``_in_file`` and ``_db`` when they are set, marks the registered
    callback with ``OPERATION_COMPLETED = 1`` and then raises SystemExit.
    """
    global _in_file, _db, callback
    if _in_file:
        _in_file.close()
    if _db:
        _db.close()
    if callback:
        # Plain attribute assignment stores the flag in the function's
        # attribute dictionary and does not depend on the Python 2-only
        # 'func_dict' alias.
        callback.OPERATION_COMPLETED = 1
    sys.exit()

def convert_entry_to_html(hword, defn):
    '''Prepare one dictionary entry for storage.

    The headword only gets the plain replacements listed in HEADWORD_REPL;
    the definition goes through the full DSL-to-HTML conversion.

    Returns the pair (headword, definition).
    '''
    clean_headword = str_replace(hword, HEADWORD_REPL)
    html_definition = doDsl2HtmlConv(defn)
    return clean_headword, html_definition

def writeEntry(db, headWord, definition):
    '''Store one entry by handing it to db.add_record().

    db          object providing add_record(headword, definition)
    headWord    the entry's headword
    definition  the entry's definition text

    Nothing is returned; committing is left to the caller.
    '''
    db.add_record(headWord, definition)

# #######################################################
# #####################    MAIN    ######################
# #######################################################

def _detect_input_encoding(input_file):
    """Guess the codec name for *input_file* from its first byte."""
    # 0xFF is the first byte of the UTF-16 little-endian BOM; a leading NUL
    # byte is treated as UTF-16 as well (existing heuristic, kept as is).
    if isFileUnicode(input_file) or isFileUnicode(input_file, '\x00'):
        return 'utf_16'
    # 0xEF is the first byte of the UTF-8 BOM.  'utf-8-sig' strips the BOM,
    # so the first line (normally a '#' header) is not mistaken for a
    # headword.
    if isFileUnicode(input_file, '\xEF'):
        return 'utf-8-sig'
    return 'utf-8'


def _store_entry(db, headWord, definition):
    """Convert one raw DSL entry to HTML and write it to the database."""
    headWord, definition = convert_entry_to_html(headWord, definition)
    writeEntry(db, headWord, definition)


def convert_dsl(input_file=None, output_file=None, table_name=None, callbackLog=None, encoding=None):
    """Import a Lingvo DSL file into an SQLite table.

    Args:
        input_file: path of the DSL source file.
        output_file: accepted for backward compatibility; not used (the
            database file comes from DB_NAME or the sqliteDB default).
        table_name: target table; falls back to TABLE_NAME and then to an
            interactive prompt that proposes the input file's base name.
        callbackLog: optional callable stored in the module-level
            ``callback`` and used by logCallback().
        encoding: codec name of the input file; 'auto' forces detection;
            None keeps whatever INPUT_ENCODING already holds (for example
            from the -i option) and detects only when that is empty.

    Parsing rules: blank lines and lines starting with '#' are skipped; a
    line that does not start with a space or tab is a headword and closes
    the previous entry; lines starting with a space or tab belong to the
    current entry's definition.

    The existing table of that name is dropped and re-created.

    This function does not return: it finishes with cleanExit(), which
    raises SystemExit.
    """
    global INPUT_ENCODING
    global OUTPUT_ENCODING
    global END_OF_LINE
    global TABLE_NAME
    # without these declarations the assignments below would create locals
    # and cleanExit() would never see the open handles
    global callback, _in_file, _db

    callback = callbackLog

    if encoding == 'auto':
        # forget any value left over from an earlier conversion
        INPUT_ENCODING = ''
    elif encoding:
        INPUT_ENCODING = encoding
    # encoding=None: keep INPUT_ENCODING (it may come from the command line)

    # Runs first so that a missing file gets the friendly error message.
    END_OF_LINE = detectLineTerminator(input_file)

    # Fill in only what the caller did not choose.
    if not INPUT_ENCODING:
        INPUT_ENCODING = _detect_input_encoding(input_file)
    if not OUTPUT_ENCODING:
        OUTPUT_ENCODING = 'utf-8'

    in_file = codecs.open(input_file, 'rb', encoding=INPUT_ENCODING)
    _in_file = in_file # for cleanup
    print('Reading input file:\n%s ("%s")...\n' % (os.path.basename(input_file), INPUT_ENCODING))

    # ############################################
    # sqlite database init
    # ############################################
    if DB_NAME:
        db = sqliteDB(DB_NAME)
    else:
        db = sqliteDB()
    _db = db # for cleanup

    table_name = table_name or TABLE_NAME

    if not table_name:
        table_auto = os.path.splitext(os.path.basename(input_file))[0]
        entered = raw_input('''
    Specify SQLite table name
    (press ENTER to accept current name)
    Current name: %s
    New name: ''' % table_auto).strip() # skip leading/trailing spaces

        # use what was typed; fall back to the proposed name on plain ENTER
        table_name = entered or table_auto

    print("Processing started at:\t" + time.strftime("%H:%M:%S", time.localtime()))
    print("For big files conversion might take a long time...\nPlease wait...")
    startTime = time.time()

    try:
        db.create_table(table_name)
    except Exception as e:
        logCallback(e)

    # ############################################
    # main loop
    # ############################################
    headWord = ""
    definition_parts = []
    entryCount = 0

    for line in in_file:

        line_stripped = line.strip()

        if not line_stripped:
            continue

        ## skip comments
        if line[0] == "#":
            continue

        if line[0] in (" ", "\t"):  # is a DEFINITION line
            if headWord:
                definition_parts.append(line_stripped + os.linesep)
            continue

        # is a HEADWORD: it closes the entry collected so far
        if headWord:
            _store_entry(db, headWord, ''.join(definition_parts))
            entryCount += 1
            definition_parts = []
            if not entryCount % 1024:
                logCallback("Entries: %d" % entryCount)
        headWord = line_stripped

    # write last entry (converted and counted like all the others);
    # nothing is written for a file without entries
    if headWord:
        _store_entry(db, headWord, ''.join(definition_parts))
        entryCount += 1

    db.commit()
    db.close()

    if DIRTY_CONVERSION:
        logCallback( '''WARNING: Data was lost during converstion!
The selected output encoding "%s" does not support all the characters in the source file!
''' % OUTPUT_ENCODING)
    elapsedTime = "%02d:%02d" % divmod(int(time.time() - startTime), 60)

    logCallback("Total entries processed: %d   Elapsed time: %s" % (entryCount, elapsedTime))
    cleanExit()


################################################################################3
################################################################################3
if __name__ == "__main__":
    # Options end up in the module globals; the positionals come back here.
    args = parseCommandLine()

    if not args:
        Usage()  # prints the help text and exits

    in_fname = args[0]

    if len(args) > 1:
        out_fname = args[1]
    else:
        # derive "<input base name>_<output encoding>.HTM"
        base_name = os.path.splitext(os.path.split(in_fname)[-1])[0]
        out_fname = base_name + '_' + OUTPUT_ENCODING + '.HTM'

    convert_dsl(in_fname, out_fname)
    cleanExit()
